House Prices - à la Kaggle

Ask a home buyer to describe their dream house, and they probably won’t begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition’s dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence.
With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this competition challenges you to predict the final price of each home.

Deliverables:
Your group is to turn in a paper that is no more than 7 pages long (without the appendix). Please put your code in the appendix, but put any graphs and tables in the body of the paper.

Sample Format
Required deliverables in the complete report. The format of your paper (headers, sections, etc.) is flexible, although it should contain the following information:

Introduction

Data Description
(Where did the data come from? How big is it? How many observations? Where can we find out more? Which specific variables do we need to know about in order to understand your analysis?)

Analysis Question 1:
Restatement of Problem

Specify the Model  
   
Checking Assumptions   
    Residual Plots   
    Influential point analysis (Cook’s D and Leverage)  
    Make sure to address each assumption.  

Comparing Competing Models  
    adj R2    
    Interval CVPress    
  
Parameter Interpretation  
    Interpretation   
    Confidence Intervals   

Conclusion
A short summary of the analysis.

Analysis Question 2
Restatement of Problem

Model Selection
Type of Selection
Stepwise
Forward
Backward
CUSTOM

    Checking Assumptions   
        Residual Plots  
        Influential point analysis (Cook’s D and Leverage)  
        Make sure to address each assumption  

    Comparing Competing Models  
        Adj R2     
        Interval CVPress     
        Kaggle Score   

    Conclusion: A short summary of the analysis.    
  

Appendix
Well commented SAS Code for Analysis 1 and 2

Rubric:
Presentation (30%):
Organized paper with title, headings, subheadings, etc.
Labeled plots, figures, tables and charts.
Every plot, figure, table and chart included is referenced in the paper and vice versa.
No spelling or grammatical errors.
Analysis Question 1: (35%)
Analysis Question 2: (35 %)

    # Read the training data from data_dir, then move to home_dir.
    setwd(data_dir)
    homes <- read.csv("train.csv", stringsAsFactors = FALSE)
    setwd(home_dir)

    # Work with lower-case column names from here on.
    names(homes) <- tolower(names(homes))

    # Turn every character column into a factor; the first column is
    # deliberately left out of the conversion.
    is_text <- vapply(homes, is.character, logical(1))
    is_text[1] <- FALSE
    homes[is_text] <- lapply(homes[is_text], factor)
    

# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ...   remove outliers ... more than 5 sigma from mean value
# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    
    # Blank out (set to NA) every value lying more than 5 standard deviations
    # from its column mean. The first column and the last column (sale price)
    # are not touched.
    lst <- length(homes) - 1    # sale price is (currently) last column

    for (i in 2:lst) {
        if (is.numeric(homes[[i]])) {
            # Compute the z-scores once, so the upper and lower cut-offs are
            # measured against the same mean and standard deviation. Scaling
            # a second time after the high outliers were already blanked
            # would test the low side against shifted statistics.
            z <- as.vector(scale(homes[[i]]))

            # which() ignores NA/NaN z-scores (missing values, or a column
            # with zero variance), so those entries are left unchanged.
            homes[[i]][which(abs(z) > 5)] <- NA
        }
    }

# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ...   create a few new columns
# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

    # Sale date: the first day of the month given by yrsold / mosold.
    sale_month <- sprintf("%02d", homes$mosold)
    dates <- paste(homes$yrsold, sale_month, "01")
    homes$sale_date <- as.Date(dates, format = "%Y %m %d")

    # Bathroom count in which each half bath contributes one half.
    homes$total_baths <- homes$fullbath + 0.5 * homes$halfbath

# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ...   scale each column independently
# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

#   for (i in 2 : length(homes))
#   {
#       if(class(homes[,i]) == "integer" || class(homes[,i]) == "numeric")
#       {
#           homes[,i] <- scale(homes[,i])
#       }
#   }

# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ...   make some plots for numeric variables... linear, log_x, log_y, log_xy ...
# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

#   pdf ("homes_train_plots.pdf", width = 10, height = 7)

    # For every numeric column (the first column is skipped) draw three
    # panels: the raw values in row order, a histogram, and log(sale price)
    # against the variable. On the 2 x 3 grid each page holds two variables.
    # Every panel is labelled with the column name; the default labels would
    # all read "homes[, i]", which makes the panels indistinguishable.
    par(mfrow = c(2, 3))
    for (i in seq_along(homes)[-1]) {
        values <- homes[[i]]
        # is.numeric() covers integer and double columns, and also numeric
        # matrix columns such as those produced by scale().
        if (is.numeric(values)) {
            var_name <- names(homes)[i]
            plot(values, main = var_name, ylab = var_name)
            hist(values, main = paste("Histogram of", var_name), xlab = var_name)
            plot(log(homes$saleprice) ~ values,
                 xlab = var_name, ylab = "log(saleprice)")
        }
    }

    # Box plots of log(sale price) for every factor column (the first column
    # is skipped), one plot per page.
    par(mfrow = c(1, 1))
    for (i in seq_along(homes)[-1]) {
        if (is.factor(homes[[i]])) {
            var_name <- names(homes)[i]
            # The title has to be added to the object that gets printed:
            # `p + ggtitle(...)` on its own line builds a new plot and throws
            # it away, leaving the printed plot untitled.
            p <- ggplot(homes, aes(x = homes[, i], y = log(saleprice), fill = homes[, i])) +
                geom_boxplot() +
                ggtitle(var_name) +
                labs(x = var_name, fill = var_name)
            print(p)
        }
    }

    # Sale price against sale date.
    plot(homes$saleprice ~ homes$sale_date)

#   dev.off()
    
    # Simple linear regression of sale price on each numeric column (the first
    # column is skipped). One line is printed per column with the column
    # index, its name, R^2 and the p-value of the slope.
    # NOTE(review): saleprice is itself a numeric column, so it is also
    # regressed on itself (r^2 = 1) -- confirm whether that row is wanted.
    for (i in seq_along(homes)[-1]) {
        if (is.numeric(homes[[i]])) {
            fit <- lm(homes$saleprice ~ homes[[i]])
            fit_summary <- summary(fit)    # computed once, used twice below

            # The coefficient table has no slope row when the slope cannot be
            # estimated (e.g. a predictor with a single distinct value);
            # report NA explicitly instead of indexing past the table's end.
            coefs <- fit_summary$coefficients
            p_value <- if (nrow(coefs) >= 2) coefs[2, 4] else NA_real_

            print(sprintf(" ... %3d : %20s | r^2 = %8.3f | p-value = %12.4e",
                          i, names(homes)[i], fit_summary$r.squared, p_value))
        }
    }
## [1] " ...   2 :           mssubclass | r^2 =    0.007 | p-value =   1.2665e-03"
## [1] " ...   4 :          lotfrontage | r^2 =    0.145 | p-value =   9.6021e-43"
## [1] " ...   5 :              lotarea | r^2 =    0.140 | p-value =   1.1041e-49"
## [1] " ...  18 :          overallqual | r^2 =    0.626 | p-value =  2.1857e-313"
## [1] " ...  19 :          overallcond | r^2 =    0.006 | p-value =   2.9124e-03"
## [1] " ...  20 :            yearbuilt | r^2 =    0.273 | p-value =  2.9902e-103"
## [1] " ...  21 :         yearremodadd | r^2 =    0.257 | p-value =   3.1649e-96"
## [1] " ...  27 :           masvnrarea | r^2 =    0.214 | p-value =   2.1244e-77"
## [1] " ...  35 :           bsmtfinsf1 | r^2 =    0.166 | p-value =   2.4724e-59"
## [1] " ...  37 :           bsmtfinsf2 | r^2 =    0.003 | p-value =   4.8072e-02"
## [1] " ...  38 :            bsmtunfsf | r^2 =    0.046 | p-value =   1.1830e-16"
## [1] " ...  39 :          totalbsmtsf | r^2 =    0.417 | p-value =  6.3610e-173"
## [1] " ...  44 :            x1stflrsf | r^2 =    0.395 | p-value =  6.7032e-161"
## [1] " ...  45 :            x2ndflrsf | r^2 =    0.102 | p-value =   5.7643e-36"
## [1] " ...  46 :         lowqualfinsf | r^2 =    0.003 | p-value =   2.7800e-02"
## [1] " ...  47 :            grlivarea | r^2 =    0.519 | p-value =  1.9399e-233"
## [1] " ...  48 :         bsmtfullbath | r^2 =    0.052 | p-value =   1.5503e-18"
## [1] " ...  49 :         bsmthalfbath | r^2 =    0.000 | p-value =   5.7466e-01"
## [1] " ...  50 :             fullbath | r^2 =    0.314 | p-value =  1.2365e-121"
## [1] " ...  51 :             halfbath | r^2 =    0.081 | p-value =   1.6505e-28"
## [1] " ...  52 :         bedroomabvgr | r^2 =    0.029 | p-value =   7.2242e-11"
## [1] " ...  53 :         kitchenabvgr | r^2 =    0.018 | p-value =   1.9184e-07"
## [1] " ...  55 :         totrmsabvgrd | r^2 =    0.285 | p-value =  2.7723e-108"
## [1] " ...  57 :           fireplaces | r^2 =    0.218 | p-value =   6.1415e-80"
## [1] " ...  60 :          garageyrblt | r^2 =    0.237 | p-value =   8.7051e-83"
## [1] " ...  62 :           garagecars | r^2 =    0.410 | p-value =  2.4986e-169"
## [1] " ...  63 :           garagearea | r^2 =    0.389 | p-value =  5.2650e-158"
## [1] " ...  67 :           wooddecksf | r^2 =    0.107 | p-value =   9.8439e-38"
## [1] " ...  68 :          openporchsf | r^2 =    0.115 | p-value =   1.9621e-40"
## [1] " ...  69 :        enclosedporch | r^2 =    0.020 | p-value =   4.9036e-08"
## [1] " ...  70 :           x3ssnporch | r^2 =    0.000 | p-value =   9.2444e-01"
## [1] " ...  71 :          screenporch | r^2 =    0.007 | p-value =   1.6782e-03"
## [1] " ...  72 :             poolarea | r^2 =    0.000 | p-value =           NA"
## [1] " ...  76 :              miscval | r^2 =    0.000 | p-value =   4.2297e-01"
## [1] " ...  77 :               mosold | r^2 =    0.002 | p-value =   7.6128e-02"
## [1] " ...  78 :               yrsold | r^2 =    0.001 | p-value =   2.6941e-01"
## [1] " ...  81 :            saleprice | r^2 =    1.000 | p-value =   0.0000e+00"
## [1] " ...  83 :          total_baths | r^2 =    0.358 | p-value =  2.8268e-142"
# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ...   Columns to remove - based on visual inspection
# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
    
    # Names of the columns to discard; every other column is kept in its
    # original order.
    drop_cols <- c(
        "id", "mssubclass", "street", "alley", "utilities", "condition2",
        "roofmatl", "centralair", "bsmtfinsf2", "lowqualfinsf", "bsmthalfbath",
        "kitchenabvgr", "x3ssnporch", "screenporch", "garagequal", "garagecond",
        "paveddrive", "poolarea", "poolqc", "miscval", "mosold", "yrsold"
    )

    # Fail if any listed column is absent rather than ignoring it silently.
    stopifnot(all(drop_cols %in% names(homes)))
    homes_subset <- homes[, setdiff(names(homes), drop_cols), drop = FALSE]

# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
# ...   Impute NAs to functional value
# ...   -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

    # Fill in missing values column by column:
    #   * numeric columns     -> the column's minimum observed value
    #   * categorical columns -> the label "None"
    # Every column is visited, including the first one: `id` has already been
    # dropped from homes_subset, so there is no column to skip any more.
    # NOTE(review): values blanked by the 5-sigma outlier step are also NA at
    # this point and therefore receive the column minimum -- confirm that this
    # is intended.
    for (j in seq_along(homes_subset)) {
        column <- homes_subset[[j]]
        if (!anyNA(column)) {
            next
        }

        if (is.numeric(column)) {
            column[is.na(column)] <- min(column, na.rm = TRUE)
        } else if (is.factor(column)) {
            # Character columns were converted to factors when the data was
            # loaded, so a test for class "character" never matches here. A
            # factor only accepts existing levels, hence "None" is registered
            # first (union() avoids duplicating it if it is already a level).
            levels(column) <- union(levels(column), "None")
            column[is.na(column)] <- "None"
        } else if (is.character(column)) {
            column[is.na(column)] <- "None"
        }

        homes_subset[[j]] <- column
    }

    # Add the natural log of the sale price as the last column, then remove
    # the raw sale price and the sale date.
    homes_subset$log_saleprice <- log(homes_subset$saleprice)
    homes_subset[c("saleprice", "sale_date")] <- NULL

    # Change into sas_dir and write the cleaned data there without row names.
    sas_dir <- "~/sas/SASUniversityEdition/myfolders/"
    setwd(sas_dir)
    write.csv(homes_subset, file = "training_set_cleaned.csv", row.names = FALSE)